# Victim age ----
# An age of 0 stands for a missing value, so recode those entries as NA.
data$vict_age <- replace(data$vict_age, which(data$vict_age == 0), NA)

# Histogram of victim ages in 5-year-wide bins.
ggplot(data = data, mapping = aes(x = vict_age)) +
  geom_histogram(
    binwidth = 5,
    alpha = 0.7,
    color = "black",
    fill = "#433E85FF"
  ) +
  ggtitle("Age Distribution of Victims") +
  xlab("Age") +
  ylab("Frequency") +
  theme_minimal()

# Bin victim age into four left-closed groups:
# [-Inf, 18), [18, 40), [40, 60) and [60, Inf).
# Missing ages stay NA in age_group.
data[["age_group"]] <- cut(
  x = data[["vict_age"]],
  breaks = c(-Inf, 18, 40, 60, Inf),
  labels = c("juvenile", "Young adult", "Middle-aged people", "The elderly"),
  right = FALSE
)
# Bar chart of victim counts per age group; rows with unknown age are dropped.
ggplot(
  data = subset(data, !is.na(vict_age)),
  mapping = aes(x = age_group)
) +
  geom_bar(alpha = 0.7, color = "black", fill = "#25858EFF") +
  ggtitle("Age Group Distribution of Victims") +
  xlab("Age Group") +
  ylab("Count") +
  theme_minimal()

# Crime severity by age group ----
# Label every record by severity: part_1_2 == 1 becomes "Serious", any other
# value "Less Serious". The label is stored as a factor.
data$severity_label <- factor(
  ifelse(data$part_1_2 == 1, "Serious", "Less Serious")
)

# Significance test: Pearson's chi-squared test on the age group x severity
# contingency table.
severity_age_table <- table(data[["age_group"]], data[["severity_label"]])
chisq_test <- chisq.test(severity_age_table)
print(chisq_test)
##
## Pearson's Chi-squared test
##
## data: severity_age_table
## X-squared = 4762.7, df = 3, p-value < 2.2e-16
# Report whether the test is significant at the 5% level.
print(
  if (chisq_test$p.value < 0.05) {
    "Age group has a statistically significant relationship with crime severity."
  } else {
    "No significant relationship between age group and crime severity."
  }
)
## [1] "Age group has a statistically significant relationship with crime severity."
# Proportion bars: share of each severity level within every age group
# (rows with unknown age are dropped).
ggplot(
  data = subset(data, !is.na(vict_age)),
  mapping = aes(x = age_group, fill = severity_label)
) +
  geom_bar(alpha = 0.7, position = "fill") +
  ggtitle("Crime Severity Distribution by Age Group") +
  xlab("Age Group") +
  ylab("Proportion") +
  labs(fill = "Crime Severity") +
  theme_minimal()

# Crime severity by gender ----
# Keep only rows with a usable vict_sex (drop NA and the "-" placeholder),
# then add gender_label with readable names for the sex codes.
clean_data <- data %>%
  subset(!is.na(vict_sex) & vict_sex != "-") %>%
  mutate(
    gender_label = recode(
      vict_sex,
      "F" = "Female",
      "M" = "Male",
      "H" = "Intersex/Other",
      "X" = "Unknown"
    )
  )
# Share of each severity level within every gender.
# - complete() adds a zero-count row for any severity/gender pair that never
#   occurs, so every gender has a row for every severity level.
# - percentage is computed per gender_label, so it sums to 100 within a gender.
# - ungroup() drops the per-gender grouping once the percentages exist, so
#   later verbs applied to severity_gender_data do not silently run per group.
severity_gender_data <- clean_data %>%
  group_by(severity_label, gender_label) %>%
  summarise(count = n(), .groups = "drop") %>%
  complete(severity_label, gender_label, fill = list(count = 0)) %>%
  group_by(gender_label) %>%
  mutate(percentage = count / sum(count) * 100) %>%
  ungroup()

# Order rows by gender, largest share first within each gender.
severity_gender_table <- severity_gender_data %>%
  arrange(gender_label, desc(percentage))

# Display the table as styled HTML.
kable(
  severity_gender_table,
  format = "html",
  caption = "Crime Severity by Gender and Percentage"
) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
## Crime Severity by Gender and Percentage
##
## | severity_label | gender_label   |  count | percentage |
## |:---------------|:---------------|-------:|-----------:|
## | Less Serious   | Female         | 197898 |   56.10149 |
## | Serious        | Female         | 154852 |   43.89851 |
## | Serious        | Intersex/Other |     70 |   62.50000 |
## | Less Serious   | Intersex/Other |     42 |   37.50000 |
## | Serious        | Male           | 237003 |   59.73325 |
## | Less Serious   | Male           | 159766 |   40.26675 |
## | Serious        | Unknown        |  57399 |   60.70050 |
## | Less Serious   | Unknown        |  37162 |   39.29950 |
# One pie per gender showing the severity split, each slice annotated with
# its percentage (rounded to one decimal place).
ggplot(
  data = severity_gender_data,
  mapping = aes(x = "", y = percentage, fill = severity_label)
) +
  geom_col(width = 1, alpha = 0.7) +
  geom_text(
    mapping = aes(label = paste0(round(percentage, 1), "%")),
    position = position_stack(vjust = 0.5), # centre labels inside each slice
    size = 4
  ) +
  coord_polar(theta = "y") + # turn the stacked bar into a pie
  facet_wrap(vars(gender_label)) + # one panel per recoded gender label
  labs(
    title = "Crime Severity Distribution by Gender",
    x = NULL,
    y = NULL,
    fill = "Crime Severity"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank()
  )

# Victim descent ----
# Drop rows whose vict_descent is missing or the "-" placeholder, then replace
# the descent codes with full descriptions. Any code without an explicit
# mapping falls into "Others" through .default.
clean_data <- data %>%
  filter(!is.na(vict_descent), vict_descent != "-") %>%
  mutate(
    vict_descent = recode(
      vict_descent,
      "B" = "Black",
      "H" = "Hispanic",
      "W" = "White",
      "X" = "Unknown",
      "O" = "Others",
      .default = "Others"
    )
  )
# Proportion of victims per descent group.
# 1. Count rows per group and convert the counts to percentages of the total.
# 2. Relabel every group below 5% (and the existing "Others") as "Others".
# 3. Sum counts and percentages per label so "Others" collapses to one row.
race_distribution <- clean_data %>%
  group_by(vict_descent) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(
    percentage = count / sum(count) * 100,
    vict_descent = ifelse(
      percentage < 5 | vict_descent == "Others",
      "Others",
      vict_descent
    )
  ) %>%
  group_by(vict_descent) %>%
  summarise(across(c(count, percentage), sum), .groups = "drop")
# Pie chart of the descent shares, each slice annotated with its percentage
# (rounded to one decimal place).
ggplot(
  data = race_distribution,
  mapping = aes(x = "", y = percentage, fill = vict_descent)
) +
  geom_col(width = 1, alpha = 0.7) +
  geom_text(
    mapping = aes(label = paste0(round(percentage, 1), "%")),
    position = position_stack(vjust = 0.5), # centre labels inside each slice
    size = 4
  ) +
  coord_polar(theta = "y") + # turn the stacked bar into a pie
  labs(
    title = "Racial Distribution of Victims",
    x = NULL,
    y = NULL,
    fill = "Race"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank()
  )

# Crime severity by age group and gender ----
# Dodged counts of each severity level per age group, one panel per gender.
# Rows with unknown age are dropped. Rows whose vict_sex is NA or the "-"
# placeholder are dropped too, so no panel is drawn for invalid entries, and
# the sex codes are recoded to readable labels for the panel titles (same
# mapping as the gender_label column built for the gender proportions).
data %>%
  filter(!is.na(vict_age), !is.na(vict_sex), vict_sex != "-") %>%
  mutate(
    gender_label = recode(
      vict_sex,
      "F" = "Female",
      "M" = "Male",
      "H" = "Intersex/Other",
      "X" = "Unknown"
    )
  ) %>%
  ggplot(aes(x = age_group, fill = severity_label)) +
  geom_bar(position = "dodge", alpha = 0.7) +
  facet_wrap(~ gender_label) +
  theme_minimal() +
  labs(
    title = "Crime Severity Distribution by Age Group and Gender",
    x = "Age Group",
    y = "Count",
    fill = "Crime Severity"
  ) +
  theme(
    # Rotate x-axis labels so the age-group names do not overlap.
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Crime severity by age group and race ----
# Proportion of each severity level per age group, one panel per descent code.
# Rows with unknown age, missing descent or the "-" placeholder are dropped.
ggplot(
  data = subset(
    data,
    !is.na(vict_age) & !is.na(vict_descent) & vict_descent != "-"
  ),
  mapping = aes(x = age_group, fill = severity_label)
) +
  geom_bar(alpha = 0.7, position = "fill") +
  facet_wrap(vars(vict_descent)) +
  ggtitle("Crime Severity by Age Group and Race") +
  xlab("Age Group") +
  ylab("Proportion") +
  labs(fill = "Crime Severity") +
  theme_minimal() +
  theme(
    # Rotate x-axis labels so the age-group names do not overlap.
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Victim age by area ----
# Boxplot of victim age for every area, one box (and fill colour) per area.
box_age_area <- ggplot(
  data = data,
  mapping = aes(x = area_name, y = vict_age, fill = area_name)
) +
  geom_boxplot(
    alpha = 0.7,
    outlier.color = "black", # draw outliers as small black points
    outlier.size = 0.5
  ) +
  labs(
    title = "Age Distribution by Area",
    x = "Area",
    y = "Victim Age",
    fill = "Area"
  ) +
  theme_minimal() +
  theme(
    # Rotate x-axis labels for readability.
    axis.text.x = element_text(angle = 45, hjust = 1),
    # The legend would only repeat the x-axis, so hide it.
    legend.position = "none"
  )

# Render the boxplot as an interactive plotly widget.
ggplotly(box_age_area)